Week 2 - Regression and model validation

Read the data

# NOTE(review): rm(list = ls()) only removes objects from the workspace; it does
# not detach packages or reset options, so it is not a truly clean session.
rm(list = ls()) # clear workspace first

# Read the wrangled data set. stringsAsFactors = TRUE keeps `gender` a factor on
# R >= 4.0.0 (where the default became FALSE); the base pairs() call that
# colours points by gender needs a factor, not a character vector.
learning2014 <- read.csv(
  "~/Documents/GitHub/IODS-project/learning2014.csv",
  stringsAsFactors = TRUE
) # my own data wrangling

# Alternative source, kept for reference:
# read.table("~/Documents/GitHub/IODS-project/data/learning2014.txt", sep = ",") # Kimmo's data

Explore the structure and dimensions of the dataset

# Compact overview of the data frame: one line per column with its type and
# the first few values.
str(learning2014)
## 'data.frame':    166 obs. of  7 variables:
##  $ gender  : Factor w/ 2 levels "F","M": 1 2 1 2 2 1 2 1 2 1 ...
##  $ Age     : int  53 55 49 53 49 38 50 37 37 42 ...
##  $ Attitude: int  37 31 25 35 37 38 35 29 38 21 ...
##  $ deep    : num  3.58 2.92 3.5 3.5 3.67 ...
##  $ stra    : num  3.38 2.75 3.62 3.12 3.62 ...
##  $ surf    : num  2.58 3.17 2.25 2.25 2.83 ...
##  $ Points  : int  25 12 24 10 22 21 21 31 24 26 ...
# Dimensions as c(rows, columns).
dim(learning2014)
## [1] 166   7

The data include 166 observations of 7 variables: (1) gender (a factor: F = female, coded 1; M = male, coded 2); (2) age; (3) attitude; (4–6) mean scores for deep, strategic and surface learning; and (7) exam points.

library(ggplot2) # Access the ggplot2 library

Show a graphical overview of the data

# Gender is categorical, so show its counts as a bar chart.
p1 <- ggplot(data = learning2014, mapping = aes(x = gender))
p1 +
  geom_bar()

# The remaining variables are numeric: one histogram each, with a bin width
# chosen to suit the scale of the variable.
p2 <- ggplot(data = learning2014, mapping = aes(x = Age))
p2 +
  geom_histogram(binwidth = 5)

p3 <- ggplot(data = learning2014, mapping = aes(x = Attitude))
p3 +
  geom_histogram(binwidth = 2)

p4 <- ggplot(data = learning2014, mapping = aes(x = deep))
p4 +
  geom_histogram(binwidth = 0.5)

p5 <- ggplot(data = learning2014, mapping = aes(x = stra))
p5 +
  geom_histogram(binwidth = 0.5)

p6 <- ggplot(data = learning2014, mapping = aes(x = surf))
p6 +
  geom_histogram(binwidth = 0.5)

p7 <- ggplot(data = learning2014, mapping = aes(x = Points))
p7 +
  geom_histogram(binwidth = 2)

# Show pairwise relationships as scatter plots, coloured by gender, with a
# per-gender linear regression line (geom_smooth(method = "lm")).

# Attitude vs exam points (title previously said "deep learning" by mistake)
p8 <- ggplot(learning2014, aes(x = Attitude, y = Points, col = gender))
p8 +
  geom_point() +
  ggtitle("Relationship between exam points and attitude") +
  geom_smooth(method = "lm")

# Deep learning vs exam points
p9 <- ggplot(learning2014, aes(x = deep, y = Points, col = gender))
p9 +
  geom_point() +
  ggtitle("Relationship between exam points and deep learning") +
  geom_smooth(method = "lm")

# Strategic learning vs exam points
p10 <- ggplot(learning2014, aes(x = stra, y = Points, col = gender))
p10 +
  geom_point() +
  ggtitle("Relationship between exam points and strategic learning") +
  geom_smooth(method = "lm")

# Surface learning vs exam points
p11 <- ggplot(learning2014, aes(x = surf, y = Points, col = gender))
p11 +
  geom_point() +
  ggtitle("Relationship between exam points and surface learning") +
  geom_smooth(method = "lm")

# Age vs exam points
p12 <- ggplot(learning2014, aes(x = Age, y = Points, col = gender))
p12 +
  geom_point() +
  ggtitle("Relationship between age and exam points") +
  geom_smooth(method = "lm")

# Age vs attitude
p13 <- ggplot(learning2014, aes(x = Age, y = Attitude, col = gender))
p13 +
  geom_point() +
  ggtitle("Relationship between age and attitudes") +
  geom_smooth(method = "lm")

library(GGally)

# Base-graphics scatter-plot matrix of every column except the first (gender),
# with points coloured by gender.
pairs(x = learning2014[-1], col = learning2014[["gender"]])

# GGally plot matrix of all variables, split by gender; the factor-vs-numeric
# ("combo") panels in the lower triangle use faceted histograms with 20 bins.
p <- ggpairs(
  data = learning2014,
  mapping = aes(colour = gender, alpha = 0.3),
  lower = list(combo = wrap("facethist", bins = 20))
)

# draw the plot
p

Summary of the variables

# Per-column summary: level counts for the factor, and min / quartiles / mean /
# max for the numeric columns.
summary(learning2014)
##  gender       Age           Attitude          deep            stra      
##  F:110   Min.   :17.00   Min.   :14.00   Min.   :1.583   Min.   :1.250  
##  M: 56   1st Qu.:21.00   1st Qu.:26.00   1st Qu.:3.333   1st Qu.:2.625  
##          Median :22.00   Median :32.00   Median :3.667   Median :3.188  
##          Mean   :25.51   Mean   :31.43   Mean   :3.680   Mean   :3.121  
##          3rd Qu.:27.00   3rd Qu.:37.00   3rd Qu.:4.083   3rd Qu.:3.625  
##          Max.   :55.00   Max.   :50.00   Max.   :4.917   Max.   :5.000  
##       surf           Points     
##  Min.   :1.583   Min.   : 7.00  
##  1st Qu.:2.417   1st Qu.:19.00  
##  Median :2.833   Median :23.00  
##  Mean   :2.787   Mean   :22.72  
##  3rd Qu.:3.167   3rd Qu.:27.75  
##  Max.   :4.333   Max.   :33.00
# another way of summarising the variables
#library(dplyr)
#learning2014 %>%
#  group_by(gender) %>%
#  summarise(mean = mean(Attitude), n = n())

The sample consists mainly of female participants (110 of 166). The majority of participants are between 20 and 30 years old. Most variables are approximately normally distributed, although age is skewed to the right. Attitude is positively related to exam points, whereas the learning strategies do not appear to explain exam points. Age does not explain exam points or attitudes.

A regression model was fitted to study whether attitude and the strategic and surface learning strategies explain exam points. The learning strategies did not explain exam points significantly (strategic: p = 0.117; surface: p = 0.466), so they were dropped. In the reduced model with attitude as the only explanatory variable, attitude explained exam points significantly (p = 4.12e-09).

# Fit a multiple linear regression: exam points explained by attitude and the
# strategic and surface learning scores. Explanatory variables are joined with
# `+` on the right-hand side of the formula.
my_model1 <- lm(Points ~ Attitude + stra + surf, data = learning2014) # three explanatory variables
# Print coefficients, their t-tests, R-squared and the overall F-test.
summary(my_model1)
## 
## Call:
## lm(formula = Points ~ Attitude + stra + surf, data = learning2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1550  -3.4346   0.5156   3.6401  10.8952 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.01711    3.68375   2.991  0.00322 ** 
## Attitude     0.33952    0.05741   5.913 1.93e-08 ***
## stra         0.85313    0.54159   1.575  0.11716    
## surf        -0.58607    0.80138  -0.731  0.46563    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.296 on 162 degrees of freedom
## Multiple R-squared:  0.2074, Adjusted R-squared:  0.1927 
## F-statistic: 14.13 on 3 and 162 DF,  p-value: 3.156e-08
# Reduced model: attitude is the only explanatory variable (stra and surf are
# dropped from the formula).
my_model2 <- lm(Points ~ Attitude, data = learning2014) 
summary(my_model2)
## 
## Call:
## lm(formula = Points ~ Attitude, data = learning2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.9763  -3.2119   0.4339   4.1534  10.6645 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.63715    1.83035   6.358 1.95e-09 ***
## Attitude     0.35255    0.05674   6.214 4.12e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.32 on 164 degrees of freedom
## Multiple R-squared:  0.1906, Adjusted R-squared:  0.1856 
## F-statistic: 38.61 on 1 and 164 DF,  p-value: 4.119e-09

Produce the diagnostic plots: Residuals vs Fitted values, Normal QQ-plot and Residuals vs Leverage

# Draw only the three diagnostics named above. By default plot.lm() draws four
# plots (which = c(1, 2, 3, 5)), including Scale-Location.
#   1 = Residuals vs Fitted, 2 = Normal Q-Q, 5 = Residuals vs Leverage
plot(my_model2, which = c(1, 2, 5))